Reproducible analysis of the Indian dataset (IND)
library(knitr)
library(genio)
library(ggtree)
library(popkin)
library(BEDMatrix)
library(seriation)
library(superadmixture)
library(tidyverse)
# for plotting
library(ggplot2)
library(grid)
library(gridGraphics)
library(ggplotify)
library(ggpubr)
library(cowplot)
library(patchwork)
library(latex2exp)1 Data Preprocessing
In this document, we demonstrate our procedures for analysis of merged dataset of the mainland Indians from the IND study with the Central/South Asia and the East Asia populations from HGDP to study. We obtained the assembled genotypes India367.{bed,bim,fam} of IND study from Dr. Analabha Basu Basu et al.. The raw HGDP is available at the webpage, attributed to Bergström et al. (2020). Here we start with the assembled version of HGDP, provided by the PLINK. PLINK hosts the assembled data at the following dropbox link:
.pgenfile: https://www.dropbox.com/s/hppj1g1gzygcocq/hgdp_all.pgen.zst?dl=1.pvarfile: https://www.dropbox.com/s/1mmkq0bd9ax8rng/hgdp_all.pvar.zst?dl=1.psamfile: https://www.dropbox.com/s/0zg57558fqpj3w1/hgdp.psam?dl=1
The associated annotations can be found at the FTP site: ftp://ngs.sanger.ac.uk/production/hgdp/hgdp_wgs.20190516/metadata/hgdp_wgs.20190516.metadata.txt
We use PLINK, PLINK2 and zstd for data preprocessing. The PLINK software can be downloaded from this webpage. The PLINK2 software can be downloaded from this webpage. The ztsd is hosted at the github, and can downloaded and installed through the following commands.
We download and unzip the assembled HGDP data from the dropbox stated above. We create a folder called rawdata to host the downloaded raw data and pre-processed data.
mkdir -p rawdata && cd $_
zstd=<PATH_TO_ZSTD_EXECUTABLE>
## download `.pgen` file
wget "https://www.dropbox.com/s/hppj1g1gzygcocq/hgdp_all.pgen.zst?dl=1"
mv "hgdp_all.pgen.zst?dl=1" "hgdp_all.pgen.zst"
./$zstd -d "hgdp_all.pgen.zst"
rm "hgdp_all.pgen.zst"
## download `.pvar` file
wget "https://www.dropbox.com/s/1mmkq0bd9ax8rng/hgdp_all.pvar.zst?dl=1"
mv "hgdp_all.pvar.zst?dl=1" "hgdp_all.pvar.zst"
## download `.psam` file
wget "https://www.dropbox.com/s/0zg57558fqpj3w1/hgdp.psam?dl=1"
mv "hgdp.psam?dl=1" "hgdp_all.psam"
## download metadata
wget "ftp://ngs.sanger.ac.uk/production/hgdp/hgdp_wgs.20190516/metadata/hgdp_wgs.20190516.metadata.txt"This step serves set missing IDs to unique values to avoid these being detected as repeated IDs
PLINK2=<PATH_TO_PLINK2_EXECUTABLE>
./$PLINK2 --pfile hgdp_all vzs \
--set-missing-var-ids '@:#' \
--make-just-pvar zs \
--out hgdp_all_uniq
# replace data
mv hgdp_all_uniq.pvar.zst hgdp_all.pvar.zst
# trash
rm hgdp_all_uniq.logThis step serves to convert files from zvs format to PLINK binary format.
./$PLINK2 \
--pfile hgdp_all vzs \
--var-filter \
--snps-only just-acgt \
--max-alleles 2 \
--make-bed \
--out hgdp_wgs_autosomesThen we add subpopulation labels to FAM files.
awk -F" " '{
if (NR == FNR) {
population[$1]=$6;
if ($10 == "M") {
gender[$1]=1;
} else if ($10 == "F") {
gender[$1]=2;
} else {
gender[$1]=0;
}
}
else print population[$2],$2,$3,$4,gender[$2],$6;
}' hgdp_wgs.20190516.metadata.txt hgdp_wgs_autosomes.fam > hgdp_wgs_autosomes.fam.NEW
mv hgdp_wgs_autosomes.fam.NEW hgdp_wgs_autosomes.famThis step serves to combine India367.{bed,bim,fam} with hgdp_wgs_autosomes.{bed,bim,fam}.
PLINK=<PATH_TO_PLINK_EXECUTABLE>
# get the SNP id
awk '{print $2}' India367.bim > India367_snps.txt
# extract the portion of genetic data that overlap with India367
./$PLINK --bfile hgdp_wgs_autosomes \
--extract India367_snps.txt \
--out hgdp_overlap \
--allow-extra-chr \
--make-bed \
--geno 0.01 \
--mind 0.01 \
--maf 0.01
awk '{print $2}' hgdp_overlap.bim > hgdp_snps.txt
./$PLINK --bfile India367 \
--extract hgdp_snps.txt \
--out India367_overlap \
--allow-extra-chr \
--make-bed \
--geno 0.01 \
--mind 0.01 \
--maf 0.01
# merge two bfiles
./$PLINK --bfile hgdp_overlap \
--bmerge India367_overlap \
--make-bed \
--out India367_final
./$PLINK --bfile hgdp_overlap --exclude India367_final-merge.missnp --make-bed --out hgdp_overlap-tmp
./$PLINK --bfile India367_overlap --exclude India367_final-merge.missnp --make-bed --out India367_overlap-tmp
./$PLINK --bfile hgdp_overlap-tmp \
--bmerge India367_overlap-tmp \
--make-bed \
--geno 0.01 \
--maf 0.01 \
--out India367_finalThis code chunk cleans up the rawdata folder and removed the temporary files.
2 Estimating individual-level coancestry
In the following sections, the intermediate data will be stored at the rdata folder.
X <- BEDMatrix::BEDMatrix("rawdata/India367_final", simple_names = TRUE)
fam <- genio::read_fam("rawdata/India367_final")
info <- readr::read_tsv( "rawdata/hgdp_wgs.20190516.metadata.txt", col_types = 'ccccccddccddddd') %>%
dplyr::select(c('sample', 'population', 'latitude', 'longitude', 'region'))This code chunk cleans up the population annotation files.
fam <- dplyr::left_join(fam, info, by = c("id" = "sample")) %>%
dplyr::rename("subpop" = "region") %>%
dplyr::mutate(subpop = ifelse(subpop == "AFRICA", "Africa",
ifelse(subpop == "MIDDLE_EAST", "MiddleEast",
ifelse(subpop == "EUROPE", "Europe",
ifelse(subpop == "CENTRAL_SOUTH_ASIA", "CSAsia",
ifelse(subpop == "EAST_ASIA", "EAsia",
ifelse(subpop == "AMERICA", "Americas", "Oceania")))))))
# label population for Indians
fam$population <- sapply(seq_along(fam$population), function(index) {
population <- fam$population[index]
if (is.na(population)) {
id <- fam$id[index]
return(unlist(stringr::str_split(id, "-|_"))[1])
} else {
return(population)
}
})
fam <- fam %>% dplyr::mutate(population = ifelse(population == "MT", "MRT", population)) %>%
dplyr::mutate(population = ifelse(population == "BR2", "WBR", population)) %>%
dplyr::mutate(population = ifelse(population == "GD", "GND", population)) %>%
dplyr::mutate(population = ifelse(population == "IR", "IYR", population)) %>%
dplyr::mutate(population = ifelse(population == "IL", "IRL", population)) %>%
dplyr::mutate(population = ifelse(population == "JW", "JRW", population)) %>%
dplyr::mutate(population = ifelse(population == "PY", "PNY", population)) %>%
dplyr::mutate(population = ifelse(population == "K", "KSH", population)) %>%
dplyr::mutate(population = ifelse(population == "KO", "KOR", population)) %>%
dplyr::mutate(population = ifelse(population == "TH", "THR", population)) %>%
dplyr::mutate(population = ifelse(population == "SA", "SAN", population)) %>%
dplyr::mutate(population = ifelse(population == "KA", "KDR", population)) %>%
dplyr::mutate(population = ifelse(population == "PL1", "PLN", population)) %>%
dplyr::mutate(population = ifelse(population == "MP", "MPB", population))
# label subpop for Indians
CastePopoluations <- c("KSH", "GBR", "WBR", "MRT", "IYR", "PLN")
Dravidian <- c("KDR", "IRL", "PNY")
Austro <- c("GND", "HO", "SAN", "KOR", "BIR")
TibetoBurman <- c("MPB", "THR", "TRI", "JAM")
JarwaOnge <- c("JRW", "ONG")
fam <- dplyr::mutate(fam, subpop = ifelse(is.na(subpop) & population %in% CastePopoluations, "IE", subpop)) %>%
dplyr::mutate(fam, subpop = ifelse(is.na(subpop) & population %in% Dravidian, "Dravidian", subpop)) %>%
dplyr::mutate(fam, subpop = ifelse(is.na(subpop) & population %in% Austro, "AA", subpop)) %>%
dplyr::mutate(fam, subpop = ifelse(is.na(subpop) & population %in% TibetoBurman, "TB", subpop)) %>%
dplyr::mutate(fam, subpop = ifelse(is.na(subpop) & population %in% JarwaOnge, "J&O", subpop))
# retain only CSAsia, IE, Dravidian, AA, TB and EAsia
subpop_order <- c('CSAsia', 'IE', 'Dravidian', 'AA', 'TB', 'EAsia')
index <- fam$subpop %in% subpop_order
fam <- fam[index, ]
X <- X[index, ]
# reorder individuals
index <- order( match(fam$subpop, subpop_order) )
fam <- fam[index, ]
X <- X[index, ]This code chunk performs a relatively primitive quality control and remove variants according to the following filters:
- We obtain the subset of individuals from HGDP and the subset of the individuals from Basu
- We remove the variants that the difference between
af_hgdpandaf_basuis greater than 0.2. This threshold is intended to tackle with the allele flipping issue arised from merging dataset. - We also remove variants that the
missing_hgdpormissing_basuis greater than0.005. This step is intended to only keep variants that have a fair representation from both dataset.
X_hgdp <- X[ stringr::str_starts(fam$id, "HGDP"), ]
X_basu <- X[!stringr::str_starts(fam$id, "HGDP"), ]
af_hgdp <- colMeans(X_hgdp, na.rm = TRUE) / 2
af_basu <- colMeans(X_basu, na.rm = TRUE) / 2
idx1 <- abs(af_basu - af_hgdp) < 0.2
idx2 <- (colMeans(is.na(X_hgdp)) < 0.005) & (colMeans(is.na(X_basu)) < 0.005)
X <- X[, idx1 & idx2]Here we demonstrate how to estimate the individual-level coancestry \(\boldsymbol{\Theta}\) according to the Ochoa-Storey (OS) method by popkin package. So here we only presents the codes but not running them. We provide the pre-computed coancestry at rdata/coanc_os.rda. It should noted that the popkin function returns the kinship coefficients instead of the coancestry coefficients. Therefore, we use the inbr_diag function in the popkin package to map kinship coefficients \(\phi_{jk}\)’s to coancestry coefficients \(\theta_{jk}\)’s:
\[ \theta_{jk} = \begin{cases} 2\phi_{jk} - 1 & j = k \\ \phi_{jk} &j \neq k \end{cases}. \]
obj <- popkin::popkin_A(t(X))
A_min <- mean(sort(obj$A[lower.tri(obj$A)])[1:100])
kinship_os <- 1 - obj$A / A_min
coanc_os <- popkin::inbr_diag(kinship_os)
coanc_os <- ifelse(coanc_os < 0, 0, coanc_os)
save(coanc_os, file = "rdata/coanc_os.rda")
save(fam, file = "rdata/fam.rda")
save(X, file = "rdata/X.rda")We can visualize the individual-level coancestry of the simulated data using plot_popkin function in the popkin function. We use the following helper function plot_colors_subpops to label the sub-populations.
plot_colors_subpops <- function(pops, srt = 0, cex = 0.6, y = FALSE) {
n <- length(pops)
k <- unique(pops)
pops <- factor(pops, levels = unique(pops))
xintercept <- cumsum(table(pops))
breaks <- xintercept - 0.5 * as.numeric(table(pops))
if (y) {
plot(NULL, xlim = c(0, 1), ylim = c(1, n), axes = FALSE, ann = FALSE, xaxs = "i", yaxs = "i")
text(1, n-rev(breaks), rev(unique(pops)), cex = cex, srt = srt, xpd = TRUE, adj = c(1, 0.5))
} else {
plot(NULL, xlim = c(1, n), ylim = c(0, 1), axes = FALSE, ann = FALSE, xaxs = "i", yaxs = "i")
text(breaks, 1, unique(pops), cex = cex, srt = srt, xpd = TRUE, adj = c(1, 0.5))
}
}load("rdata/coanc_os.rda")
load("rdata/fam.rda")
par(mar = c(0, 0, 0, 0) + 0.2)
layout(rbind(c(3, 1, 2), c(3, 1, 5), c(0, 4, 0)), widths = c(0.2, 1, 0.2), heights = c(0.5, 0.5, 0.3))
popkin::plot_popkin(kinship = coanc_os, layout_add = FALSE, leg_cex = 0.8, labs_text = FALSE, labs_lwd = 0.1, labs = fam$subpop, ylab = '', leg_title = "Coancestry")
par(mar = c(0.2, 0, 0.2, 0))
plot_colors_subpops(fam$subpop, y = TRUE, cex = 0.8)
mtext('Individuals', side = 2, line = 0.5, xpd = NA, cex = 0.8)
par(mar = c(0, 0.2, 0, 0.2))
plot_colors_subpops(fam$subpop, srt = 90, cex = 0.8)3 Estimating admixture proportions
The following chunk of the codes estimates the admixture proportions \(\boldsymbol{Q}\) from genotypes. We first estimate the individual specific allele frequencies \(\boldsymbol{\Pi}\) using the est_p_indiv function in the superadmixture package. We then estimate \(\boldsymbol{Q}\) by decomposing \(\boldsymbol{\Pi}\) with factor_p_indiv function in the superadmixture package. It takes hours to run the following codes, so we’re not running here.
for (k_antepops in 2:15) {
# estimate individual-specific allele frequencies
obj <- est_p_indiv(X, k_antepops, loci_on_cols = TRUE)
p_indiv <- obj$p_indiv
rowspace <- obj$rowspace
# estimate admix_props by decomposing individual-specific allele frequencies
obj <- factor_p_indiv(p_indiv = p_indiv,
rowspace = rowspace,
k_antepops = k_antepops,
init_method = "rowspace",
verbose = TRUE,
max_iters = 1000,
tol = 1e-4)
# save admixture proportions
Q_hat <- obj$Q_hat
dir.create(file.path("rdata", "Q_hat"), showWarnings = FALSE)
save(Q_hat, file = paste0("rdata/Q_hat/", k_antepops, ".rda"))
}4 Selecting number of antecedent populations
The following chunk of the codes calculates the p-values of structured Hardy-Weinberg Equilibrium test (sHWE) and plots the relationship between the negative entropy of the sHWE p-values and \(K\). It takes hours to run the following codes, so we’re not running here.
for (k_antepops in 2:15) {
# estimate `rowspace`, an input for sHWE function
rowspace <- est_p_indiv(X, k_antepops, loci_on_cols = TRUE, rowspace_only = TRUE)
# perform sHWE test
pvals <- sHWE(X, k_antepops, loci_on_cols = TRUE, rowspace = rowspace)
save(pvals, file = paste0("rdata/sHWE/", k_antepops, ".rda"))
}
neg_entropy <- c()
for (k_antepops in 2:15) {
load(paste0("rdata/sHWE/", k_antepops, ".rda"))
neg_entropy <- c(neg_entropy, calc_neg_entropy(pvals))
}
save(neg_entropy, file = "rdata/neg_entropy.rda")We visualize the relationship between \(K\) and the negative entropy of sHWE p-values.
load("rdata/neg_entropy.rda")
plot(2:15, neg_entropy, ylim = c(-2.2, -2.14), xlab = "", ylab = "", type = "b", family="Times New Roman")
mtext(latex2exp::TeX(r"(\textit{$K$})"), side = 1, col = "black", line = 2.5, family="Times New Roman", pch = 10)
mtext("Negative Entropy", side = 2, line = 2.5, col = "black", family="Times New Roman")We select \(K = 7\) according to this plot.
5 Estimating antecedent populations coancestry
After obtaining individual-level coancestry \(\boldsymbol{\Theta}\) and admixture proportions \(\boldsymbol{Q}\), we can use the function est_coanc to estimate population coancestry under the super admixture and standard admixture.
load("rdata/coanc_os.rda")
load("rdata/Q_hat/7.rda")
coanc_antepops_sup <- est_coanc(coanc_os, Q_hat, model = "super")
coanc_antepops_std <- est_coanc(coanc_os, Q_hat, model = "standard")We then compute the corresponding individual-level coancestry under the super admixture and standard admixture.
6 Visualization
We can visualize the coancestry of antecedent populations coanc_antepops_sup and admixture proportions Q_hat using the helper functions get_seq_colors, plot_tree, barplot_admix and heatmap_coanc_antepops we provide.
# reorder antecedent populations according to the value of the population inbredding
index <- order(diag(coanc_antepops_sup))
Q_hat <- Q_hat[index, ]
k_antepops <- 7
coanc_antepops_sup <- coanc_antepops_sup[index, index]
# label antecedent populations
colnames(coanc_antepops_sup) <- rownames(coanc_antepops_sup) <- paste0("S", 1:k_antepops)
# fit tree
tree <- bnpsd::fit_tree(round(coanc_antepops_sup, 6))
# plot an uncolorred tree
plot_tree(tree)Based on the topology of the tree, we decide to color the populations \(S_1\), \(S_2\) by light blue and dark blue, \(S_3\), \(S_4\), \(S_5\) by light green, medium green and dark green, \(S_6\) and \(S_7\) by light red and dark red. We can pick colors by using the get_seq_colors() function and its returned value can be used to specify the coloring scheme for plot_tree() function.
colors <- c(get_seq_colors("Blues", 2), get_seq_colors("Greens", 3), get_seq_colors("Reds", 2))
names(colors) <- paste0("S", 1:k_antepops)
fig_tree <- plot_tree(tree, colors = colors)
# flip tree
fig_tree <- ggtree::flip(fig_tree, get_parent_node("S6", tree), get_parent_node("S4", tree))
fig_tree <- ggtree::flip(fig_tree, get_current_node("S4", tree), get_parent_node("S3", tree))
fig_tree <- ggtree::scaleClade(fig_tree, get_parent_node("S6", tree), 2)
fig_tree <- ggtree::scaleClade(fig_tree, get_parent_node("S4", tree), 0.5)
fig_treeWe can visualize admixture proportions Q_hat using the barplot_admix function.
We also can visualize the coancestry among antecedent populations by heatmap_coanc_antepops. We define the helper function heatmap_coanc_antepops_wrapper that returns a ggplot object of the heatmap.
heatmap_coanc_antepops_wrapper <- function(coanc_antepops, tl.cex = 1.3, cl.cex = 1, tl.offset = 0.6) {
superadmixture::heatmap_coanc_antepops(coanc_antepops, tl.cex = tl.cex, cl.cex = cl.cex, tl.offset = tl.offset)
grab_grob <- function(){
gridGraphics::grid.echo()
grid::grid.grab()
}
p <- grab_grob()
# save correlation matrix colors to a vector, then make coloured matrix grob transparent
matrix.colors <- grid::getGrob(p, grid::gPath("square"), grep = TRUE)[["gp"]][["fill"]]
p <- grid::editGrob(p, grid::gPath("square"), grep = TRUE, gp = grid::gpar(col = NA, fill = NA))
# apply the saved colours to the underlying matrix grob
p <- grid::editGrob(p, grid::gPath("symbols-rect-1"), grep = TRUE, gp = grid::gpar(fill = matrix.colors))
# convert the background fill from white to transparent, while we are at it
p <- grid::editGrob(p, grid::gPath("background"), grep = TRUE, gp = grid::gpar(fill = NA))
p <- ggplotify::as.ggplot(p) + ggplot2::theme(plot.margin = unit(c(0, 0, 0, 0), "pt"))
return(p)
}par(xpd = TRUE)
fig_coancestry <- heatmap_coanc_antepops_wrapper(coanc_antepops_sup, tl.cex = 1.1, tl.offset = 1)fig_coancestry <- fig_coancestry + theme(plot.margin = margin(0, 0, 0, 0, "pt"))
coancestry_lab <- ggplot() +
annotate("text", x = 0.5, y = 0.5, size = 5, label = "Coancestry") +
xlim(0, 1) +
ylim(0, 1) +
theme_void()
fig_coancestry <- ggarrange(fig_coancestry, coancestry_lab, ncol = 1, heights = c(1, 0.1))We combine all plots.
# combine plots of tree, admix props, subpops and coancestry
design <- "
112
113
##3
"
fig_coancestry + fig_tree + fig_admix +
patchwork::plot_layout(
design = design,
widths = c(0.6, 0.2, 1.4),
heights = c(0.6, 0.2, 0.4)) +
patchwork::plot_annotation(tag_levels = 'A', tag_prefix = '(', tag_suffix = ')') &
ggplot2::theme(plot.tag = ggplot2::element_text(color = "black", size = 21, face = 'bold'))7 Simulating genotypes from the structure of IND study
We can simulate genotypes using the double-admixture method with the dbl_admixture function. It takes about an hour to run the following codes, so we’re not running here.
X <- as.matrix(X)
p_anc <- 0.5 * colMeans(X, na.rm = TRUE)
coanc_antepops <- round(coanc_antepops, 6)
X_sim <- dbl_admixture(p_anc, coanc_antepops, Q_hat, geno_only = TRUE)
## estimate kinship
obj <- popkin::popkin_A(X_sim)
A_min <- mean(sort(obj$A[lower.tri(obj$A)])[1:100])
kinship_sim_os <- 1 - obj$A / A_min
## mapping kinship coefficients to coancestry coefficients
coanc_sim_os <- popkin::inbr_diag(kinship_sim_os)
coanc_sim_os <- ifelse(coanc_sim_os < 0, 0, coanc_sim_os)
save(coanc_sim_os, file = "rdata/coanc_sim_os.rda")We can also use NORTA method to simulate genotypes. It takes a few hours to run the following codes, so we’re not running here.
X_sim_norta <- norta_approx(p_anc, coanc_antepops, Q_hat,
geno_only = TRUE, parallel = TRUE, method = "numeric", tol = 1e-5, mc_cores = 20)
## estimate kinship
obj <- popkin::popkin_A(X_sim_norta)
A_min <- mean(sort(obj$A[lower.tri(obj$A)])[1:100])
kinship_sim_os_norta <- 1 - obj$A / A_min
## mapping kinship coefficients to coancestry coefficients
coanc_sim_os_norta <- popkin::inbr_diag(kinship_sim_os_norta)
coanc_sim_os_norta <- ifelse(coanc_sim_os_norta < 0, 0, coanc_sim_os_norta)
save(coanc_sim_os_norta, file = "rdata/coanc_sim_os_norta.rda")We plot the individual-level coancestry.
load("rdata/coanc_sim_os.rda")
load("rdata/coanc_sim_os_norta.rda")
par(mar = c(0, 0, 0, 0) + 0.2)
layout(rbind(
c(14, 19, 15, 20, 16, 21, 0),
c( 7, 1, 0, 2, 0, 3, 6),
c( 0, 9, 0, 10, 0, 11, 0),
c(17, 22, 18, 23, 0, 0, 0),
c( 8, 4, 0, 5, 0, 0, 0),
c( 0, 12, 0, 13, 0, 0, 0)),
heights = c(0.2, 1, 0.22, 0.2, 1, 0.22),
widths = c(0.2, 1, 0.08, 1, 0.08, 1, 0.2))
coanc_sim_os <- (coanc_sim_os - 1) * (1 - min(coanc_sup)) + 1
coanc_sim_os_norta <- (coanc_sim_os_norta - 1) * (1 - min(coanc_sup)) + 1
# We truncate the large entries of `coanc_std`
# coanc_std <- ifelse(coanc_std > max(coanc_os), max(coanc_os), coanc_std)
popkin::plot_popkin(kinship = list(coanc_os, coanc_sup, coanc_std, coanc_sim_os, coanc_sim_os_norta),
layout_add = FALSE,
leg_cex = 0.8,
labs_text = FALSE,
labs_lwd = 0.1,
labs = fam$subpop,
ylab = '',
leg_title = "Coancestry",
panel_letters = NULL)
par(mar = c(0.5, 0.5, 0.2, 0))
plot_colors_subpops(fam$subpop, y = TRUE, cex = 1.1)
mtext('Individuals', side = 2, line = 0.5, xpd = NA, cex = 0.8)
par(mar = c(0.5, 0.5, 0.2, 0))
plot_colors_subpops(fam$subpop, y = TRUE, cex = 1.1)
mtext('Individuals', side = 2, line = 0.5, xpd = NA, cex = 0.8)
par(mar = c(0, 0.5, 0, 0.2))
plot_colors_subpops(fam$subpop, srt = 90, cex = 1.1)
par(mar = c(0, 0.5, 0, 0.2))
plot_colors_subpops(fam$subpop, srt = 90, cex = 1.1)
par(mar = c(0, 0.5, 0, 0.2))
plot_colors_subpops(fam$subpop, srt = 90, cex = 1.1)
par(mar = c(0, 0.5, 0, 0.2))
plot_colors_subpops(fam$subpop, srt = 90, cex = 1.1)
par(mar = c(0, 0.5, 0, 0.2))
plot_colors_subpops(fam$subpop, srt = 90, cex = 1.1)
par(mar = c(0, 0, 0, 0))
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.3, "(A)", cex = 2, xpd = TRUE, font = 2)
par(mar = c(0, 0, 0, 0))
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.3, "(B)", cex = 2, xpd = TRUE, font = 2)
par(mar = c(0, 0, 0, 0))
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.3, "(C)", cex = 2, xpd = TRUE, font = 2)
par(mar = c(0, 0, 0, 0))
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.3, "(D)", cex = 2, xpd = TRUE, font = 2)
par(mar = c(0, 0, 0, 0))
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.3, "(E)", cex = 2, xpd = TRUE, font = 2)
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.4, "OS individual coancestry\nof real data", cex = 1.6)
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.4, "Super admixture individual coancestry\n of real data", cex = 1.6)
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.4, "Standard admixture individual coancestry\n of real data", cex = 1.6)
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.4, "OS individual coancestry\nof bootstrap data (double-admixture)", cex = 1.6)
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.4, "OS individual coancestry\nof bootstrap data (NORTA)", cex = 1.6)8 Confirming significant hypothesis tests of standard admixture versus super admixture
We perform the significance test of coancestry among antecedent populations to compare the model fit between standard admixture versus super admixture. The following chunk of codes is used to generate the null distribution of the test statistics \(U\). It should be noted that it takes days to run this chunk of code. We submit each iteration as a separate cluster job to speed up (not show here). Therefore, we’re just showing the code but not running it. We provide the pre-computed null test statistics in the file rdata/test_stats0.rda.
for (task_id in 1:1000) {
## compute null test statistics
inbr_antepops <- diag(est_coanc(coanc_os, Q_hat, model = "standard"))
## compute null
test_stats0 <- compute_null(p_anc, Q_hat, inbr_antepops, verbose = TRUE)
# save results
dir.create(file.path("rdata", "test_stat0"), showWarnings = FALSE)
save(test_stat0, file = paste0("rdata/test_stat0/", task_id, ".rda"))
}
test_stats0 <- c()
for (task_id in 1:1000) {
load(paste0("rdata/test_stat0/", task_id, ".rda"))
test_stats0 <- c(test_stats0, test_stat0)
}
save(test_stats0, file = "rdata/test_stats0.rda")We plot the distribution of null test statistics and label our observed test statistics.
load("rdata/test_stats0.rda")
load("rdata/coanc_os.rda")
load("rdata/Q_hat/7.rda")
coanc_antepops_sup <- est_coanc(coanc_os, Q_hat, model = "super")
coanc_antepops_std <- est_coanc(coanc_os, Q_hat, model = "standard")
test_stat1 <- norm(coanc_antepops_sup - coanc_antepops_std, "F")
p <- data.frame(test_stats0 = test_stats0) %>%
ggplot(data, mapping = aes(x = test_stats0)) +
geom_histogram(aes(y =..density..),
fill = "dodgerblue3",
bins = 10,
binwidth = NULL,
color = "black",
alpha = 0.7,
size = 0.1) +
labs(x = latex2exp::TeX(r"(Null test-statistics)"), y='Density') +
scale_y_continuous(limits = c(0, 250)) +
theme_bw(base_size = 16) +
ggtitle("Indian study (IND)") +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
plot.title = element_text(hjust = 0.5, size=16)) +
annotate("text", x = 0.051, y = 240, size = 6, label = latex2exp::TeX(sprintf("$U_{obs}$ = %.3f", test_stat1)))
p